Natural sequence and DMS analysis¶

In [1]:
# Imports
import os
import json
import warnings
import dmslogo
import neutcurve
import numpy as np
import scipy as sp
import pandas as pd
import altair as alt
import seaborn as sns
import statsmodels.api
import matplotlib.colors
from Bio import SeqIO, AlignIO 
from matplotlib import pyplot as plt
from matplotlib import ticker as mticker

# Rearranged to make the tree look nicer
# re-ordered
tol_muted_adjusted = [
    "#000000",
    "#CC6677", 
    "#1f78b4", 
    "#88CCEE",
    "#DDDDDD",
    "#882255",
    "#117733", 
    "#DDCC77",
    "#44AA99", 
    "#EE7733",
    "#AA4499",
    "#999933", 
    "#CC3311",
]

# Seaborn style settings
sns.set(rc={
    "figure.dpi":300, 
    "savefig.dpi":300,
    "svg.fonttype":"none",
})
sns.set_style("ticks")
sns.set_palette(tol_muted_adjusted)

# Suppress warnings
warnings.simplefilter("ignore")

# Allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
In [2]:
# this cell is tagged as `parameters` for papermill parameterization
filtered_escape_377H = None
filtered_escape_89F = None
filtered_escape_2510C = None
filtered_escape_121F = None
filtered_escape_256A = None
filtered_escape_372D = None

func_scores = None
min_times_seen = None
n_selections = None

GPC_tree_mutations = None
GPC_FEL_results = None
GPC_FUBAR_results = None

natural_seq_metadata = None
natural_seq_alignment = None

out_dir_natural = None
ols_regression = None
In [3]:
# Parameters
filtered_escape_377H = (
    "results/filtered_antibody_escape_CSVs/377H_filtered_mut_effect.csv"
)
filtered_escape_89F = (
    "results/filtered_antibody_escape_CSVs/89F_filtered_mut_effect.csv"
)
filtered_escape_2510C = (
    "results/filtered_antibody_escape_CSVs/2510C_filtered_mut_effect.csv"
)
filtered_escape_121F = (
    "results/filtered_antibody_escape_CSVs/121F_filtered_mut_effect.csv"
)
filtered_escape_256A = (
    "results/filtered_antibody_escape_CSVs/256A_filtered_mut_effect.csv"
)
filtered_escape_372D = (
    "results/filtered_antibody_escape_CSVs/372D_filtered_mut_effect.csv"
)
func_scores = "results/func_effects/averages/293T_entry_func_effects.csv"
GPC_tree_mutations = (
    "non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_tree_mutations.csv"
)
GPC_FEL_results = (
    "non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_FEL_results.json"
)
GPC_FUBAR_results = (
    "non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_FUBAR_results.json"
)
natural_seq_metadata = (
    "non-pipeline_analyses/LASV_phylogeny_analysis/Results/LASV_S_segment_metadata.tsv"
)
natural_seq_alignment = "non-pipeline_analyses/LASV_phylogeny_analysis/Results/LASV_GPC_protein_alignment.fasta"
out_dir_natural = "results/natural_isolate_escape/"
min_times_seen = 2
n_selections = 8
ols_regression = "results/natural_isolate_escape/ols_regression.html"
In [4]:
# # Uncomment for running interactive
# filtered_escape_377H = "../results/filtered_antibody_escape_CSVs/377H_filtered_mut_effect.csv"
# filtered_escape_89F = "../results/filtered_antibody_escape_CSVs/89F_filtered_mut_effect.csv"
# filtered_escape_2510C = "../results/filtered_antibody_escape_CSVs/2510C_filtered_mut_effect.csv"
# filtered_escape_121F = "../results/filtered_antibody_escape_CSVs/121F_filtered_mut_effect.csv"
# filtered_escape_256A = "../results/filtered_antibody_escape_CSVs/256A_filtered_mut_effect.csv"
# filtered_escape_372D = "../results/filtered_antibody_escape_CSVs/372D_filtered_mut_effect.csv"

# func_scores = "../results/func_effects/averages/293T_entry_func_effects.csv"
# min_times_seen = 2
# n_selections = 8

# GPC_tree_mutations = "../non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_tree_mutations.csv"
# GPC_FEL_results = "../non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_FEL_results.json"
# GPC_FUBAR_results = "../non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_FUBAR_results.json"

# natural_seq_metadata = "../non-pipeline_analyses/LASV_phylogeny_analysis/Results/LASV_S_segment_metadata.tsv"
# natural_seq_alignment = "../non-pipeline_analyses/LASV_phylogeny_analysis/Results/LASV_GPC_protein_alignment.fasta"

# out_dir_natural = "../results/natural_isolate_escape/"
# ols_regression = "../results/natural_isolate_escape/ols_regression.html"

Process functional scores and antibody escape to create one dataframe with site level information.

In [5]:
# Read functional score data
functional_scores = pd.read_csv(func_scores)

# Filter for minimum selections, times seen and no stop codons
functional_scores = (
    functional_scores.query(
        "n_selections >= @n_selections and times_seen >= @min_times_seen and mutant != '*'"
    )
    .drop(columns=["mutant", "times_seen"])
    .groupby(["site", "wildtype"])
    .aggregate({
        "effect" : "mean"
    })
    .reset_index()
)

# Initialize list of escape files and final df
antibody_files = [
    filtered_escape_377H,
    filtered_escape_89F,
    filtered_escape_2510C,
    filtered_escape_121F,
    filtered_escape_256A,
    filtered_escape_372D,
]
antibody_escape = pd.DataFrame(columns=["site", "wildtype"])

# Add escape to dataframe for each antibody
for index,antibody_file in enumerate(antibody_files):

    antibody_name = antibody_file.split("/")[-1].split("_")[0]

    # Load data as dataframe
    escape_df = pd.read_csv(antibody_file)

    # Clip lower scores to 0
    escape_df["escape_median"] = escape_df["escape_median"].clip(lower=0)

    # Rename escape column to include antibody name
    escape_df = (
        escape_df
        .rename(columns={"escape_median" : "escape_" + antibody_name})
        .groupby(["site", "wildtype"])
        .aggregate({"escape_" + antibody_name : "sum"})
        .reset_index()
    )

    if index == 0:
        antibody_escape = (
            pd.concat([
                antibody_escape, 
                escape_df[["site", "wildtype", "escape_" + antibody_name]],
            ])
        )
    else:
        # Merge dataframes
        antibody_escape = (
            antibody_escape.merge(
                escape_df[["site", "wildtype", "escape_" + antibody_name]],
                how="left",
                on=["site", "wildtype",],
                validate="one_to_one",
            )
        )

# Create new column with antibody summed escape
antibody_escape["total_escape"] = (
    antibody_escape[[
        "escape_377H",
        "escape_89F",
        "escape_2510C",
        "escape_121F",
        "escape_256A",
        "escape_372D",
    ]].sum(axis=1)
)

merged_df = (
    functional_scores.merge(
        antibody_escape,
        how="left",
        on=["site", "wildtype"],
        validate="one_to_one",
    )
)

# Load tree mutation counts
tree_mutations = pd.read_csv(GPC_tree_mutations)

# Calculate counts of synonymous and nonsynonous mutations and ratio
tree_mutations["synonymous"] = (
    tree_mutations["mut_type"].apply(lambda x: 1 if x == "synonymous" else 0)
)
tree_mutations["nonsynonymous"] = (
    tree_mutations["mut_type"].apply(lambda x: 1 if x == "nonsynonymous" else 0)
)
tree_mutations = (
    tree_mutations
    .groupby(["site"])
    .aggregate({
        "mut_type" : "count",
        "synonymous" : "sum",
        "nonsynonymous" : "sum",
    })
    .reset_index()
)
tree_mutations["non/syn"] = tree_mutations["nonsynonymous"] / tree_mutations["synonymous"]

# Merge tree mutation counts
merged_df = (
    merged_df.merge(
        tree_mutations,
        how="left",
        on=["site"],
        validate="one_to_one",
    )
)

# Load FEL data
FEL_results = pd.read_json(GPC_FEL_results)

# Process json file and extract site info
FEL_results = (
    FEL_results
    .reset_index()
)
FEL_results = (
    pd.DataFrame(FEL_results.at[0, "MLE"])
    .rename(columns={"0" : "data"})
)
headers = [
    "FEL_alpha",
    "FEL_beta",
    "FEL_alpha=beta",
    "FEL_LRT",
    "FEL_p-value",
    "FEL_Total branch length",
    "FEL_p-asmp",
]
FEL_results[headers] = pd.DataFrame(
    FEL_results["data"].tolist(), 
    index=FEL_results.index,
)
FEL_results = (
    FEL_results
    .reset_index(drop=False)
    .rename(columns={"index" : "site"})
    .drop(columns=["data"])
)
FEL_results["site"] = FEL_results["site"] + 1

# Merge tree mutation counts
merged_df = (
    merged_df.merge(
        FEL_results,
        how="left",
        on=["site"],
        validate="one_to_one",
    )
)

# Load FUBAR data
# requires json load because the output file
# has mixed dict and series
json_data = json.load(open(GPC_FUBAR_results))
FUBAR_results = (
    pd.DataFrame(json_data["MLE"]["content"])
    .rename(columns={"0" : "data"})
)

# Process json file and extract site info
headers = [
    "FUBAR_alpha",
    "FUBAR_beta",
    "FUBAR_beta-alpha",
    "FUBAR_Prob[alpha>beta]",
    "FUBAR_Prob[alpha<beta]",
    "FUBAR_BayesFactor[alpha<beta]",
    "empty_1", # dummy columns
    "empty_2", # dummy columns
]
FUBAR_results[headers] = pd.DataFrame(
    FUBAR_results["data"].tolist(), 
    index=FUBAR_results.index,
)
FUBAR_results = (
    FUBAR_results
    .reset_index(drop=False)
    .rename(columns={"index" : "site"})
    .drop(columns=["data", "empty_1", "empty_2"])
)
FUBAR_results["site"] = FUBAR_results["site"] + 1

# Merge tree mutation counts
merged_df = (
    merged_df.merge(
        FUBAR_results,
        how="left",
        on=["site"],
        validate="one_to_one",
    )
)

merged_df.info()
merged_df
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491 entries, 0 to 490
Data columns (total 27 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   site                           491 non-null    object 
 1   wildtype                       491 non-null    object 
 2   effect                         491 non-null    float64
 3   escape_377H                    442 non-null    float64
 4   escape_89F                     441 non-null    float64
 5   escape_2510C                   426 non-null    float64
 6   escape_121F                    428 non-null    float64
 7   escape_256A                    425 non-null    float64
 8   escape_372D                    441 non-null    float64
 9   total_escape                   442 non-null    float64
 10  mut_type                       474 non-null    float64
 11  synonymous                     474 non-null    float64
 12  nonsynonymous                  474 non-null    float64
 13  non/syn                        474 non-null    float64
 14  FEL_alpha                      491 non-null    float64
 15  FEL_beta                       491 non-null    float64
 16  FEL_alpha=beta                 491 non-null    float64
 17  FEL_LRT                        491 non-null    float64
 18  FEL_p-value                    491 non-null    float64
 19  FEL_Total branch length        491 non-null    float64
 20  FEL_p-asmp                     491 non-null    int64  
 21  FUBAR_alpha                    491 non-null    float64
 22  FUBAR_beta                     491 non-null    float64
 23  FUBAR_beta-alpha               491 non-null    float64
 24  FUBAR_Prob[alpha>beta]         491 non-null    float64
 25  FUBAR_Prob[alpha<beta]         491 non-null    float64
 26  FUBAR_BayesFactor[alpha<beta]  491 non-null    float64
dtypes: float64(24), int64(1), object(2)
memory usage: 103.7+ KB
Out[5]:
site wildtype effect escape_377H escape_89F escape_2510C escape_121F escape_256A escape_372D total_escape ... FEL_LRT FEL_p-value FEL_Total branch length FEL_p-asmp FUBAR_alpha FUBAR_beta FUBAR_beta-alpha FUBAR_Prob[alpha>beta] FUBAR_Prob[alpha<beta] FUBAR_BayesFactor[alpha<beta]
0 1 M -4.155000 NaN NaN NaN NaN NaN NaN NaN ... 0.000000 1.000000e+00 0.000000 0 2.704564 0.002811 -2.701752 0.997722 1.655895e-04 1.011602e-03
1 2 G -4.056263 NaN NaN NaN NaN NaN NaN NaN ... 94.745563 0.000000e+00 9.143795 0 2.814837 0.004233 -2.810604 1.000000 1.113056e-23 6.798646e-23
2 3 Q -3.179526 1.193990 0.20157 0.961580 0.495290 0.238989 0.871600 3.963019 ... 56.382157 5.961898e-14 17.160898 0 2.821999 0.086386 -2.735613 1.000000 2.994383e-15 1.828995e-14
3 4 I -3.150942 0.000000 0.70370 0.000000 0.374100 0.015481 0.000000 1.093281 ... 64.579451 8.881784e-16 14.001125 0 2.816045 0.282695 -2.533350 1.000000 1.152638e-13 7.040412e-13
4 5 V -2.146336 0.657288 0.15390 0.637900 0.147550 0.607944 0.032400 2.236982 ... 32.439042 1.229905e-08 16.644957 0 5.458957 0.552020 -4.906937 1.000000 1.882220e-11 1.149676e-10
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
486 487 V -2.555938 0.000000 1.30182 0.274350 0.017900 0.102483 0.622140 2.318693 ... 106.375165 0.000000e+00 20.544976 0 3.266311 0.158815 -3.107496 1.000000 5.728720e-24 3.499152e-23
487 488 K -1.083361 0.870287 0.90637 0.536484 0.304730 0.026117 1.357948 4.001936 ... 34.183572 5.015054e-09 28.195301 0 7.025409 1.187379 -5.838030 0.999995 2.197589e-15 1.342307e-14
488 489 W -3.742706 NaN NaN NaN NaN NaN NaN NaN ... 0.000000 1.000000e+00 0.000000 0 2.725774 0.011578 -2.714195 0.996387 1.082165e-03 6.617119e-03
489 490 K -0.990541 2.232642 0.43778 1.541050 0.863020 0.415287 0.346231 5.836010 ... 54.779059 1.348921e-13 11.131260 0 2.814756 0.082011 -2.732745 1.000000 9.728127e-15 5.942026e-14
490 491 R -1.267112 1.104519 0.61278 0.440502 0.263633 0.687325 0.065475 3.174234 ... 10.799155 1.015464e-03 2.095351 0 0.782909 0.084182 -0.698727 0.999168 1.602841e-04 9.791862e-04

491 rows × 27 columns

Correlation of phylogenetic tree metrics¶

In [6]:
comparisons = [
    ("FEL_beta", "nonsynonymous"),
    ("FUBAR_beta", "nonsynonymous"),
    ("FUBAR_beta", "FEL_beta"),
]
subplots = []
for pair in comparisons:
    stat1 = pair[0]
    stat2 = pair[1]

    # Calculate correlation 
    r, p = sp.stats.pearsonr(
        merged_df[[stat1, stat2]].dropna()[stat1],
        merged_df[[stat1, stat2]].dropna()[stat2],
    )
    print(f"r correlation of {stat1} and {stat2}: {r:.2f}")
    
    curr_subplot = alt.Chart(
        merged_df, 
    ).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.15,
    ).encode(
        alt.Y(
            stat1,
            axis=alt.Axis(
                title=stat1,
                # values=[-4,-3,-2,-1,0,1],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(domain=[-4.1,1.1])
        ),
        alt.X(
            stat2,
            axis=alt.Axis(
                title=stat2,
                # values=[-3,-2,-1,0],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(type="symlog")
        ),
        tooltip=[
            "site",
            "wildtype",
            "effect",
            "total_escape",
            "nonsynonymous",
            "synonymous",
            "non/syn",
            "FEL_beta",
            "FEL_alpha",
            "FEL_alpha=beta",
            "FUBAR_beta",
            "FUBAR_alpha",
            "FUBAR_beta-alpha",
        ],
    ).properties(
        width=300,
        height=300,
    )

    subplots.append(curr_subplot)

nonsyn = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    spacing=5,
    title=["Correlations of nonsynonymous mutation metrics"],
)


comparisons = [
    ("FEL_alpha", "synonymous"),
    ("FUBAR_alpha", "synonymous"),
    ("FUBAR_alpha", "FEL_alpha"),
]
subplots = []
for pair in comparisons:
    stat1 = pair[0]
    stat2 = pair[1]

    # Calculate correlation 
    r, p = sp.stats.pearsonr(
        merged_df[[stat1, stat2]].dropna()[stat1],
        merged_df[[stat1, stat2]].dropna()[stat2],
    )
    print(f"r correlation of {stat1} and {stat2}: {r:.2f}")
    
    curr_subplot = alt.Chart(
        merged_df, 
    ).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.15,
    ).encode(
        alt.Y(
            stat1,
            axis=alt.Axis(
                title=stat1,
                # values=[-4,-3,-2,-1,0,1],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(domain=[-4.1,1.1])
        ),
        alt.X(
            stat2,
            axis=alt.Axis(
                title=stat2,
                # values=[-3,-2,-1,0],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(type="symlog")
        ),
        tooltip=[
            "site",
            "wildtype",
            "effect",
            "total_escape",
            "nonsynonymous",
            "synonymous",
            "non/syn",
            "FEL_beta",
            "FEL_alpha",
            "FEL_alpha=beta",
            "FUBAR_beta",
            "FUBAR_alpha",
            "FUBAR_beta-alpha",
        ],
    ).properties(
        width=300,
        height=300,
    )

    subplots.append(curr_subplot)

syn = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    spacing=5,
    title=["Correlations of synonymous mutation metrics"],
)


comparisons = [
    ("FEL_alpha=beta", "non/syn"),
    ("FUBAR_beta-alpha", "non/syn"),
    ("FUBAR_beta-alpha", "FEL_alpha=beta"),
]
subplots = []
for pair in comparisons:
    stat1 = pair[0]
    stat2 = pair[1]

    # Calculate correlation 
    r, p = sp.stats.pearsonr(
        merged_df[[stat1, stat2]].replace([np.inf, -np.inf], np.nan).dropna()[stat1],
        merged_df[[stat1, stat2]].replace([np.inf, -np.inf], np.nan).dropna()[stat2],
    )
    print(f"r correlation of {stat1} and {stat2}: {r:.2f}")
    
    curr_subplot = alt.Chart(
        merged_df, 
    ).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.15,
    ).encode(
        alt.Y(
            stat1,
            axis=alt.Axis(
                title=stat1,
                # values=[-4,-3,-2,-1,0,1],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(domain=[-4.1,1.1])
        ),
        alt.X(
            stat2,
            axis=alt.Axis(
                title=stat2,
                # values=[-3,-2,-1,0],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(type="symlog")
        ),
        tooltip=[
            "site",
            "wildtype",
            "effect",
            "total_escape",
            "nonsynonymous",
            "synonymous",
            "non/syn",
            "FEL_beta",
            "FEL_alpha",
            "FEL_alpha=beta",
            "FUBAR_beta",
            "FUBAR_alpha",
            "FUBAR_beta-alpha",
        ],
    ).properties(
        width=300,
        height=300,
    )

    subplots.append(curr_subplot)

nonsyn_and_syn = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    spacing=5,
    title=["Correlations of combined nonsynonymous and synonymous mutation metrics"],
)

combined_plot = alt.vconcat(
    nonsyn,
    syn,
    nonsyn_and_syn,
    spacing=5,
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
)

combined_plot
r correlation of FEL_beta and nonsynonymous: 0.57
r correlation of FUBAR_beta and nonsynonymous: 0.53
r correlation of FUBAR_beta and FEL_beta: 0.98
r correlation of FEL_alpha and synonymous: 0.36
r correlation of FUBAR_alpha and synonymous: 0.47
r correlation of FUBAR_alpha and FEL_alpha: 0.45
r correlation of FEL_alpha=beta and non/syn: 0.22
r correlation of FUBAR_beta-alpha and non/syn: 0.15
r correlation of FUBAR_beta-alpha and FEL_alpha=beta: -0.14
Out[6]:

The lower end of nonsynonymous mutations (i.e., 0 to 10) are less correlated even though the correlation looks really good when zoomed out.

Tree mutations compared to functional effects and antibody escape¶

In [7]:
comparisons = [
    ("effect", "nonsynonymous"),
    ("effect", "synonymous"),
    ("effect", "non/syn"),
]
subplots = []
for pair in comparisons:
    stat1 = pair[0]
    stat2 = pair[1]

    # Calculate correlation 
    r, p = sp.stats.pearsonr(
        merged_df[[stat1, stat2]].replace([np.inf, -np.inf], np.nan).dropna()[stat1],
        merged_df[[stat1, stat2]].replace([np.inf, -np.inf], np.nan).dropna()[stat2],
    )
    print(f"r correlation of {stat1} and {stat2}: {r:.2f}")
    
    curr_subplot = alt.Chart(
        merged_df, 
    ).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.15,
    ).encode(
        alt.Y(
            stat1,
            axis=alt.Axis(
                title=stat1,
                # values=[-4,-3,-2,-1,0,1],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(domain=[-4.1,1.1])
        ),
        alt.X(
            stat2,
            axis=alt.Axis(
                title=stat2,
                # values=[-3,-2,-1,0],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(type="symlog")
        ),
        tooltip=[
            "site",
            "wildtype",
            "effect",
            "total_escape",
            "nonsynonymous",
            "synonymous",
            "non/syn",
            "FEL_beta",
            "FEL_alpha",
            "FEL_alpha=beta",
            "FUBAR_beta",
            "FUBAR_alpha",
            "FUBAR_beta-alpha",
        ],
    ).properties(
        width=300,
        height=300,
    )

    subplots.append(curr_subplot)

effects = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    spacing=5,
    title=["Correlations of functional effects and tree mutation counts"],
)


comparisons = [
    ("total_escape", "nonsynonymous"),
    ("total_escape", "synonymous"),
    ("total_escape", "non/syn"),
]
subplots = []
for pair in comparisons:
    stat1 = pair[0]
    stat2 = pair[1]

    # Calculate correlation 
    r, p = sp.stats.pearsonr(
        merged_df[[stat1, stat2]].replace([np.inf, -np.inf], np.nan).dropna()[stat1],
        merged_df[[stat1, stat2]].replace([np.inf, -np.inf], np.nan).dropna()[stat2],
    )
    print(f"r correlation of {stat1} and {stat2}: {r:.2f}")
    
    curr_subplot = alt.Chart(
        merged_df, 
    ).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.15,
    ).encode(
        alt.Y(
            stat1,
            axis=alt.Axis(
                title=stat1,
                # values=[-4,-3,-2,-1,0,1],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(domain=[-4.1,1.1])
        ),
        alt.X(
            stat2,
            axis=alt.Axis(
                title=stat2,
                # values=[-3,-2,-1,0],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(type="symlog")
        ),
        tooltip=[
            "site",
            "wildtype",
            "effect",
            "total_escape",
            "nonsynonymous",
            "synonymous",
            "non/syn",
            "FEL_beta",
            "FEL_alpha",
            "FEL_alpha=beta",
            "FUBAR_beta",
            "FUBAR_alpha",
            "FUBAR_beta-alpha",
        ],
    ).properties(
        width=300,
        height=300,
    )

    subplots.append(curr_subplot)

escape = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    spacing=5,
    title=["Correlations of antibody escape and tree mutation counts"],
)

combined_plot = alt.vconcat(
    effects,
    escape,
    spacing=5,
    title="Tree mutations",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
)

combined_plot
r correlation of effect and nonsynonymous: 0.21
r correlation of effect and synonymous: -0.14
r correlation of effect and non/syn: 0.17
r correlation of total_escape and nonsynonymous: -0.00
r correlation of total_escape and synonymous: -0.07
r correlation of total_escape and non/syn: -0.00
Out[7]:

FEL analysis compared to functional effects and antibody escape¶

In [8]:
comparisons = [
    ("effect", "FEL_beta"),
    ("effect", "FEL_alpha"),
    ("effect", "FEL_alpha=beta"),
]
subplots = []
for pair in comparisons:
    stat1 = pair[0]
    stat2 = pair[1]

    # Calculate correlation 
    r, p = sp.stats.pearsonr(
        merged_df[[stat1, stat2]].replace([np.inf, -np.inf], np.nan).dropna()[stat1],
        merged_df[[stat1, stat2]].replace([np.inf, -np.inf], np.nan).dropna()[stat2],
    )
    print(f"r correlation of {stat1} and {stat2}: {r:.2f}")
    
    curr_subplot = alt.Chart(
        merged_df, 
    ).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.15,
    ).encode(
        alt.Y(
            stat1,
            axis=alt.Axis(
                title=stat1,
                # values=[-4,-3,-2,-1,0,1],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(domain=[-4.1,1.1])
        ),
        alt.X(
            stat2,
            axis=alt.Axis(
                title=stat2,
                # values=[-3,-2,-1,0],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(type="symlog")
        ),
        tooltip=[
            "site",
            "wildtype",
            "effect",
            "total_escape",
            "nonsynonymous",
            "synonymous",
            "non/syn",
            "FEL_beta",
            "FEL_alpha",
            "FEL_alpha=beta",
            "FUBAR_beta",
            "FUBAR_alpha",
            "FUBAR_beta-alpha",
        ],
    ).properties(
        width=300,
        height=300,
    )

    subplots.append(curr_subplot)

effects = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    spacing=5,
    title=["Correlations of functional effects and FEL metrics"],
)


comparisons = [
    ("total_escape", "FEL_beta"),
    ("total_escape", "FEL_alpha"),
    ("total_escape", "FEL_alpha=beta"),
]
subplots = []
for pair in comparisons:
    stat1 = pair[0]
    stat2 = pair[1]

    # Calculate correlation 
    r, p = sp.stats.pearsonr(
        merged_df[[stat1, stat2]].replace([np.inf, -np.inf], np.nan).dropna()[stat1],
        merged_df[[stat1, stat2]].replace([np.inf, -np.inf], np.nan).dropna()[stat2],
    )
    print(f"r correlation of {stat1} and {stat2}: {r:.2f}")
    
    curr_subplot = alt.Chart(
        merged_df, 
    ).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.15,
    ).encode(
        alt.Y(
            stat1,
            axis=alt.Axis(
                title=stat1,
                # values=[-4,-3,-2,-1,0,1],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(domain=[-4.1,1.1])
        ),
        alt.X(
            stat2,
            axis=alt.Axis(
                title=stat2,
                # values=[-3,-2,-1,0],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(type="symlog")
        ),
        tooltip=[
            "site",
            "wildtype",
            "effect",
            "total_escape",
            "nonsynonymous",
            "synonymous",
            "non/syn",
            "FEL_beta",
            "FEL_alpha",
            "FEL_alpha=beta",
            "FUBAR_beta",
            "FUBAR_alpha",
            "FUBAR_beta-alpha",
        ],
    ).properties(
        width=300,
        height=300,
    )

    subplots.append(curr_subplot)

escape = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    spacing=5,
    title=["Correlations of antibody escape and FEL metrics"],
)

combined_plot = alt.vconcat(
    effects,
    escape,
    spacing=5,
    title="FEL",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
)

combined_plot
r correlation of effect and FEL_beta: 0.31
r correlation of effect and FEL_alpha: -0.01
r correlation of effect and FEL_alpha=beta: 0.08
r correlation of total_escape and FEL_beta: 0.01
r correlation of total_escape and FEL_alpha: -0.02
r correlation of total_escape and FEL_alpha=beta: -0.07
Out[8]:

FUBAR analysis compared to functional effects and antibody escape¶

In [9]:
comparisons = [
    ("effect", "FUBAR_beta"),
    ("effect", "FUBAR_alpha"),
    ("effect", "FUBAR_beta-alpha"),
]
subplots = []
for pair in comparisons:
    stat1 = pair[0]
    stat2 = pair[1]

    # Calculate correlation 
    r, p = sp.stats.pearsonr(
        merged_df[[stat1, stat2]].replace([np.inf, -np.inf], np.nan).dropna()[stat1],
        merged_df[[stat1, stat2]].replace([np.inf, -np.inf], np.nan).dropna()[stat2],
    )
    print(f"r correlation of {stat1} and {stat2}: {r:.2f}")
    
    curr_subplot = alt.Chart(
        merged_df, 
    ).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.15,
    ).encode(
        alt.Y(
            stat1,
            axis=alt.Axis(
                title=stat1,
                # values=[-4,-3,-2,-1,0,1],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(domain=[-4.1,1.1])
        ),
        alt.X(
            stat2,
            axis=alt.Axis(
                title=stat2,
                # values=[-3,-2,-1,0],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(type="symlog")
        ),
        tooltip=[
            "site",
            "wildtype",
            "effect",
            "total_escape",
            "nonsynonymous",
            "synonymous",
            "non/syn",
            "FEL_beta",
            "FEL_alpha",
            "FEL_alpha=beta",
            "FUBAR_beta",
            "FUBAR_alpha",
            "FUBAR_beta-alpha",
        ],
    ).properties(
        width=300,
        height=300,
    )

    subplots.append(curr_subplot)

effects = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    spacing=5,
    title=["Correlations of functional effects and FUBAR metrics"],
)


comparisons = [
    ("total_escape", "FUBAR_beta"),
    ("total_escape", "FUBAR_alpha"),
    ("total_escape", "FUBAR_beta-alpha"),
]
subplots = []
for pair in comparisons:
    stat1 = pair[0]
    stat2 = pair[1]

    # Calculate correlation 
    r, p = sp.stats.pearsonr(
        merged_df[[stat1, stat2]].replace([np.inf, -np.inf], np.nan).dropna()[stat1],
        merged_df[[stat1, stat2]].replace([np.inf, -np.inf], np.nan).dropna()[stat2],
    )
    print(f"r correlation of {stat1} and {stat2}: {r:.2f}")
    
    curr_subplot = alt.Chart(
        merged_df, 
    ).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.15,
    ).encode(
        alt.Y(
            stat1,
            axis=alt.Axis(
                title=stat1,
                # values=[-4,-3,-2,-1,0,1],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(domain=[-4.1,1.1])
        ),
        alt.X(
            stat2,
            axis=alt.Axis(
                title=stat2,
                # values=[-3,-2,-1,0],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(type="symlog")
        ),
        tooltip=[
            "site",
            "wildtype",
            "effect",
            "total_escape",
            "nonsynonymous",
            "synonymous",
            "non/syn",
            "FEL_beta",
            "FEL_alpha",
            "FEL_alpha=beta",
            "FUBAR_beta",
            "FUBAR_alpha",
            "FUBAR_beta-alpha",
        ],
    ).properties(
        width=300,
        height=300,
    )

    subplots.append(curr_subplot)

escape = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    spacing=5,
    title=["Correlations of antibody escape and FUBAR metrics"],
)

combined_plot = alt.vconcat(
    effects,
    escape,
    spacing=5,
    title="FUBAR",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
)

combined_plot
r correlation of effect and FUBAR_beta: 0.30
r correlation of effect and FUBAR_alpha: 0.09
r correlation of effect and FUBAR_beta-alpha: 0.03
r correlation of total_escape and FUBAR_beta: 0.01
r correlation of total_escape and FUBAR_alpha: 0.06
r correlation of total_escape and FUBAR_beta-alpha: -0.05
Out[9]:

OLS regression for DMS measurements and nonsynonymous substitution metrics¶

Perform multiple linear regression between the DMS measurements (i.e., effect on entry and antibody escape) and the nonsynonymous mutation metrics. We perform OLS regression for both the metric as is and for the log 2 of the metric. Note that when taking the log2 of the metric, only sites with nonzero values remain (i.e., all sites that did not have any mutations observed are not included). We finally plot the predicted metric from the OLS model versus the actual value.

In [10]:
ols_df = (
    merged_df[[
        "site",
        "wildtype",
        "FEL_beta",
        "nonsynonymous",
        "FUBAR_beta",
        "effect",
        "escape_377H",
        "escape_2510C",
        "escape_89F",
        "escape_256A",
        "escape_372D",
        "escape_121F",
        "total_escape",
    ]]
    # .replace([np.inf, -np.inf], np.nan)
    .rename(columns={
        "effect" : "effect on cell entry",
        "total_escape" : "antibody escape",
    })
    .dropna() # some sites do not have measurements for antibody escape
    .reset_index(drop=True)
)


X_vars = [
    # "site",
    "effect on cell entry",
    # "escape_377H",
    # "escape_2510C",
    # "escape_89F",
    # "escape_256A",
    # "escape_372D",
    # "escape_121F",
    "antibody escape",
]

Y_vars = [
    "nonsynonymous",
    "FEL_beta",
    "FUBAR_beta",
]

subplots = []
for Y_var in Y_vars:
    
    Y_values = ols_df[[Y_var]]
    X_values = ols_df[X_vars]

    # https://www.statsmodels.org/dev/examples/notebooks/generated/ols.html
    ols_model = statsmodels.api.OLS(
        endog=Y_values,
        exog=statsmodels.api.add_constant(X_values.astype(float)),
    )
    res_ols = ols_model.fit()
    full_r2 = res_ols.rsquared
    # print(res_ols.summary())
    ols_df[Y_var+"_predicted"] = res_ols.predict()

    # Calculate correlation 
    actual_r, p = sp.stats.pearsonr(
        ols_df[[Y_var, Y_var+"_predicted"]].replace([np.inf, -np.inf], np.nan).dropna()[Y_var+"_predicted"],
        ols_df[[Y_var, Y_var+"_predicted"]].replace([np.inf, -np.inf], np.nan).dropna()[Y_var],
    )
    print(f"r correlation of predicted {Y_var} vs actual {Y_var}: {actual_r:.2f}")
    
    unique_var = {}
    # https://blog.minitab.com/en/adventures-in-statistics-2/how-to-identify-the-most-important-predictor-variables-in-regression-models
    for vremove in X_vars:
        vremove_ols_model = statsmodels.api.OLS(
            endog=Y_values,
            exog=statsmodels.api.add_constant(ols_df[[v for v in X_vars if v != vremove]].astype(float)),
        )
        vremove_res_ols = vremove_ols_model.fit()
        unique_var[vremove] = full_r2 - vremove_res_ols.rsquared
    
    # https://stackoverflow.com/a/53966201
    subtitle = [
        f"{var}: {unique_var[var] * 100:.1f}% of variance (coef {res_ols.params[var]:.3f} \u00B1 {res_ols.bse[var]:.3f})"
        for var in X_vars
    ]

    chart_title = alt.TitleParams(
        Y_var + " (n = " + str(Y_values.shape[0]) + ")",
        subtitle=subtitle,
        fontSize=16,
    )

    curr_subplot = alt.Chart(
        ols_df,
        title=chart_title,
    ).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.15,
    ).encode(
        alt.X(
            Y_var+"_predicted",
            axis=alt.Axis(
                title=["predicted " + Y_var],
                # values=[-4,-3,-2,-1,0,1],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(type="symlog")
        ),
        alt.Y(
            Y_var,
            axis=alt.Axis(
                title=["actual " + Y_var],
                # values=[-3,-2,-1,0],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(type="symlog")
        ),
        tooltip=[
            "site",
            "wildtype",
            Y_var,
            Y_var+"_predicted",
            "effect on cell entry",
            "escape_377H",
            "escape_2510C",
            "escape_89F",
            "escape_256A",
            "escape_372D",
            "escape_121F",
            "antibody escape",
            "nonsynonymous",
            "FEL_beta",
            "FUBAR_beta",
            # "synonymous",
            # "non/syn",
            # "FEL_beta",
            # "FEL_alpha",
            # "FEL_alpha=beta",
            # "FUBAR_beta",
            # "FUBAR_alpha",
            # "FUBAR_beta-alpha",
        ],
    ).properties(
        width=400,
        height=400,
    )

    subplots.append(curr_subplot)

no_log_plots = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    spacing=5,
    # title=["Correlations of predicted vs actual substitution metrics"],
)


ols_df["log2 nonsynonymous"] = np.log2(ols_df["nonsynonymous"])
ols_df["log2 FEL_beta"] = np.log2(ols_df["FEL_beta"])
ols_df["log2 FUBAR_beta"] = np.log2(ols_df["FUBAR_beta"])
ols_df = (
    ols_df
    .replace([np.inf, -np.inf], np.nan) 
    .dropna()
    .reset_index(drop=True)
)


Y_vars = [
    "log2 nonsynonymous",
    "log2 FEL_beta",
    "log2 FUBAR_beta",
]

subplots = []
for Y_var in Y_vars:
    
    Y_values = ols_df[[Y_var]]
    X_values = ols_df[X_vars]

    # https://www.statsmodels.org/dev/examples/notebooks/generated/ols.html
    ols_model = statsmodels.api.OLS(
        endog=Y_values,
        exog=statsmodels.api.add_constant(X_values.astype(float)),
    )
    res_ols = ols_model.fit()
    full_r2 = res_ols.rsquared
    # print(res_ols.summary())
    ols_df[Y_var+"_predicted"] = res_ols.predict()

    # Calculate correlation 
    actual_r, p = sp.stats.pearsonr(
        ols_df[[Y_var, Y_var+"_predicted"]].replace([np.inf, -np.inf], np.nan).dropna()[Y_var+"_predicted"],
        ols_df[[Y_var, Y_var+"_predicted"]].replace([np.inf, -np.inf], np.nan).dropna()[Y_var],
    )
    print(f"r correlation of predicted {Y_var} vs actual {Y_var}: {actual_r:.2f}")
    
    unique_var = {}
    # https://blog.minitab.com/en/adventures-in-statistics-2/how-to-identify-the-most-important-predictor-variables-in-regression-models
    for vremove in X_vars:
        vremove_ols_model = statsmodels.api.OLS(
            endog=Y_values,
            exog=statsmodels.api.add_constant(ols_df[[v for v in X_vars if v != vremove]].astype(float)),
        )
        vremove_res_ols = vremove_ols_model.fit()
        unique_var[vremove] = full_r2 - vremove_res_ols.rsquared
    
    # https://stackoverflow.com/a/53966201
    subtitle = [
        f"{var}: {unique_var[var] * 100:.1f}% of variance (coef {res_ols.params[var]:.3f} \u00B1 {res_ols.bse[var]:.3f})"
        for var in X_vars
    ]

    chart_title = alt.TitleParams(
        Y_var + " (n = " + str(Y_values.shape[0]) + ")",
        subtitle=subtitle,
        fontSize=16,
    )

    curr_subplot = alt.Chart(
        ols_df,
        title=chart_title,
    ).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.15,
    ).encode(
        alt.X(
            Y_var+"_predicted",
            axis=alt.Axis(
                title=["predicted " + Y_var],
                # values=[-4,-3,-2,-1,0,1],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(type="symlog")
        ),
        alt.Y(
            Y_var,
            axis=alt.Axis(
                title=["actual " + Y_var],
                # values=[-3,-2,-1,0],
                # domainWidth=1,
                # domainColor="black",
                # tickColor="black",
            ),
            # scale=alt.Scale(type="symlog")
        ),
        tooltip=[
            "site",
            "wildtype",
            Y_var,
            Y_var+"_predicted",
            "effect on cell entry",
            "escape_377H",
            "escape_2510C",
            "escape_89F",
            "escape_256A",
            "escape_372D",
            "escape_121F",
            "antibody escape",
            "nonsynonymous",
            "FEL_beta",
            "FUBAR_beta",
            # "synonymous",
            # "non/syn",
            # "FEL_beta",
            # "FEL_alpha",
            # "FEL_alpha=beta",
            # "FUBAR_beta",
            # "FUBAR_alpha",
            # "FUBAR_beta-alpha",
        ],
    ).properties(
        width=400,
        height=400,
    )

    subplots.append(curr_subplot)

log_plots = alt.hconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    spacing=5,
)

combined_plot = alt.vconcat(
    no_log_plots,
    log_plots,
    title=["Correlations of predicted vs actual substitution metrics"],
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
)

# Make output dir if doesn't exist
if not os.path.exists(out_dir_natural):
    os.mkdir(out_dir_natural)

combined_plot.save(ols_regression)

combined_plot
r correlation of predicted nonsynonymous vs actual nonsynonymous: 0.21
r correlation of predicted FEL_beta vs actual FEL_beta: 0.30
r correlation of predicted FUBAR_beta vs actual FUBAR_beta: 0.29
r correlation of predicted log2 nonsynonymous vs actual log2 nonsynonymous: 0.36
r correlation of predicted log2 FEL_beta vs actual log2 FEL_beta: 0.31
r correlation of predicted log2 FUBAR_beta vs actual log2 FUBAR_beta: 0.34
Out[10]:

Antibody escape compared to natural sequence time of isolation¶

In [11]:
# Load metadata as dataframe
all_metadata = pd.read_csv(natural_seq_metadata, sep="\t")

# Load alignment and metadata info
natural_seqs_df = pd.DataFrame(columns=["strain", "sequence"])

# Load list of sequence names from alignment
strains = []
for curr_fasta in SeqIO.parse(natural_seq_alignment, "fasta"):
    strains.append(str(curr_fasta.id))
    natural_seqs_df.loc[len(natural_seqs_df.index)] = [
        str(curr_fasta.id),
        str(curr_fasta.seq),
    ]

# Filter metadata
GPC_metadata = (
    all_metadata.loc[all_metadata["strain"].isin(strains)].copy()
)
# Add Sierra Leone for Josiah strain and fix date for Josiah
# using paper Wulff and Johnson, 1979
# Note that some dates might be wrong because they were incorrectly
# dated on NCBI virus
GPC_metadata.at[0, "country"] = "Sierra Leone"
GPC_metadata.at[0, "date"] = "1976-XX-XX"
# Also add host for Pinneo strain
GPC_metadata.at[1, "host"] = "Homo sapiens"

# Add sequence data and metadata
natural_seqs_df = (
    natural_seqs_df.merge(
        GPC_metadata,
        how="left",
        on="strain",
        validate="one_to_one",
    )
)

# Create column of collection year
natural_seqs_df["date_year"] = (
    natural_seqs_df["date"].apply(lambda x: x[0:4])
)

# Re-group host into simpler categories
natural_seqs_df["grouped_host"] = (
    natural_seqs_df["host"].apply(lambda x: "human" if x == "Homo sapiens" else ("unknown" if x == "MISSING" else "rodent"))
)

# Extract josiah sequence for comparisons
josiah_seq = (
    natural_seqs_df.loc[
        (natural_seqs_df["strain"] == "Josiah_NC_004296_reverse_complement_2018-08-13")
    ]["sequence"][0]
)

# Read functional score data
functional_scores = pd.read_csv(func_scores)

# Filter for minimum selections, times seen and no stop codons
functional_scores = (
    functional_scores.query(
        "n_selections >= @n_selections and times_seen >= @min_times_seen and mutant != '*'"
    )
    .reset_index(drop=True)
)

# Initialize list of escape files and final df
antibody_files = [
    filtered_escape_377H,
    filtered_escape_89F,
    filtered_escape_2510C,
    filtered_escape_121F,
    filtered_escape_256A,
    filtered_escape_372D,
]
antibody_escape = pd.DataFrame(columns=["site", "wildtype"])

# Add escape to dataframe for each antibody
for index,antibody_file in enumerate(antibody_files):

    antibody_name = antibody_file.split("/")[-1].split("_")[0]

    # Load data as dataframe
    escape_df = pd.read_csv(antibody_file)

    # Clip lower scores to 0
    escape_df["escape_median"] = escape_df["escape_median"].clip(lower=0)

    # Rename escape column to include antibody name
    escape_df = (
        escape_df
        .rename(columns={"escape_median" : "escape_" + antibody_name})
    )

    if index == 0:
        antibody_escape = (
            pd.concat([
                antibody_escape, 
                escape_df[["site", "wildtype", "mutant", "escape_" + antibody_name]],
            ])
        )
    else:
        # Merge dataframes
        antibody_escape = (
            antibody_escape.merge(
                escape_df[["site", "wildtype", "mutant", "escape_" + antibody_name]],
                how="left",
                on=["site", "wildtype", "mutant"],
                validate="one_to_one",
            )
        )

merged_df = (
    functional_scores.merge(
        antibody_escape,
        how="left",
        on=["site", "wildtype", "mutant"],
        validate="one_to_one",
    )
)

# Add column of mutation
merged_df["mutation"] = (
    merged_df["wildtype"] + merged_df["site"].astype(str) + merged_df["mutant"]
)

def calculate_natural_sequence_scores(antibody, sequence, jos_sequence):
    """
    Calculate a cumulative escape or functional 
    effects score for a natural sequence.
    """

    list_of_muts = []
    site = 1
    # Get list of mutations
    for s1, s2 in zip(jos_sequence, sequence):
        if s1 != s2 and s1 != "-" and s2 != "-":
            curr_mut = f"{s1}{site}{s2}"
            list_of_muts.append(curr_mut)
        site += 1

    total_escape = 0
    # Iterate through mutations and get escape scores
    for mut in list_of_muts:
        curr_mut = merged_df.query("mutation == @mut")
        if curr_mut.shape[0] == 1:
            curr_mut_escape = (
                curr_mut
                .fillna(0)
                .reset_index(drop=True)[antibody][0]
            )
            total_escape += curr_mut_escape

    # Return total escape
    return total_escape 

# Calculate antibody escape for individual antibodies
for antibody in ["377H", "256A", "372D", "121F", "89F", "2510C"]:
    natural_seqs_df["total_"+antibody+"_escape"] = (
        natural_seqs_df["sequence"].apply(lambda x: calculate_natural_sequence_scores(
            "escape_"+antibody,
            x,
            josiah_seq
        ))
    )

# Calculate total antibody escape
natural_seqs_df["total_escape"] = (
    natural_seqs_df[[
        "total_377H_escape", 
        "total_256A_escape", 
        "total_372D_escape", 
        "total_121F_escape", 
        "total_89F_escape", 
        "total_2510C_escape"
    ]].sum(axis=1)
)

# Calculate total functional scores
natural_seqs_df["total_func_effect"] = (
    natural_seqs_df["sequence"].apply(lambda x: calculate_natural_sequence_scores(
        "effect",
        x,
        josiah_seq
    ))
)
In [12]:
# Calculate correlation 
r, p = sp.stats.pearsonr(
    natural_seqs_df[["date_year", "total_escape"]].dropna().astype(float)["date_year"],
    natural_seqs_df[["date_year", "total_escape"]].dropna().astype(float)["total_escape"],
)
print(f"r correlation of time and antibody escape: {r:.2f}")

antibody_escape = alt.Chart(
    natural_seqs_df, 
).mark_point(
    filled=True, 
    color="black", 
    size=75,
    opacity=0.5,
).encode(
    alt.Y(
        "total_escape",
        axis=alt.Axis(
            title="total antibody escape",
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
    ),
    alt.X(
        "date_year:T",
        axis=alt.Axis(
            title="date",
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
    ),
    color=alt.Color(
        "country:N", 
        title="country",
        scale=alt.Scale(
            domain=natural_seqs_df["country"].unique().tolist(),
            range=tol_muted_adjusted[1:]
        ),
    ), 
    shape=alt.Shape(
        "grouped_host:N",
        title="host",
    ),
    tooltip=[
        "strain",
        "country",
        "host",
        "date",
        "total_escape",
        "total_377H_escape", 
        "total_256A_escape", 
        "total_372D_escape", 
        "total_121F_escape", 
        "total_89F_escape", 
        "total_2510C_escape",
        "total_func_effect",
    ],
).properties(
    width=500,
    height=300,
)

# Calculate correlation 
r, p = sp.stats.pearsonr(
    natural_seqs_df[["date_year", "total_func_effect"]].dropna().astype(float)["date_year"],
    natural_seqs_df[["date_year", "total_func_effect"]].dropna().astype(float)["total_func_effect"],
)
print(f"r correlation of time and functional effects: {r:.2f}")

func_effects = alt.Chart(
    natural_seqs_df, 
).mark_point(
    filled=True, 
    color="black", 
    size=75,
    opacity=0.5,
).encode(
    alt.Y(
        "total_func_effect",
        axis=alt.Axis(
            title="total functional effect",
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
    ),
    alt.X(
        "date_year:T",
        axis=alt.Axis(
            title="date",
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
    ),
    color=alt.Color(
        "country:N", 
        title="country",
        scale=alt.Scale(
            domain=natural_seqs_df["country"].unique().tolist(),
            range=tol_muted_adjusted[1:]
        ),
    ), 
    shape=alt.Shape(
        "grouped_host:N",
        title="host",
    ),
    tooltip=[
        "strain",
        "country",
        "host",
        "date",
        "total_escape",
        "total_377H_escape", 
        "total_256A_escape", 
        "total_372D_escape", 
        "total_121F_escape", 
        "total_89F_escape", 
        "total_2510C_escape",
        "total_func_effect",
    ],
).properties(
    width=500,
    height=300,
)

combined_plot = alt.hconcat(
    antibody_escape,
    func_effects,
    spacing=5,
    title="Antibody and functional effects across time",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
)

combined_plot
r correlation of time and antibody escape: 0.33
r correlation of time and functional effects: 0.05
Out[12]:
In [13]:
antibody_escape = alt.Chart(
    natural_seqs_df, 
).mark_point(
    filled=True, 
    color="black", 
    size=75,
    opacity=0.5,
).encode(
    alt.Y(
        "total_escape",
        axis=alt.Axis(
            title="total antibody escape",
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
    ),
    alt.X(
        "grouped_host:N",
        axis=alt.Axis(
            title="host",
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
    ),
    xOffset="jitter:Q",
    color=alt.Color(
        "country:N", 
        title="country",
        scale=alt.Scale(
            domain=natural_seqs_df["country"].unique().tolist(),
            range=tol_muted_adjusted[1:]
        ),
    ), 
    tooltip=[
        "strain",
        "country",
        "host",
        "date",
        "total_escape",
        "total_377H_escape", 
        "total_256A_escape", 
        "total_372D_escape", 
        "total_121F_escape", 
        "total_89F_escape", 
        "total_2510C_escape",
        "total_func_effect",
    ],
).transform_calculate(
    # Generate Gaussian jitter with a Box-Muller transform
    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
).properties(
    width=500,
    height=300,
)

func_effects = alt.Chart(
    natural_seqs_df, 
).mark_point(
    filled=True, 
    color="black", 
    size=75,
    opacity=0.5,
).encode(
    alt.Y(
        "total_func_effect",
        axis=alt.Axis(
            title="total functional effect",
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
    ),
    alt.X(
        "grouped_host:N",
        axis=alt.Axis(
            title="host",
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
    ),
    xOffset="jitter:Q",
    color=alt.Color(
        "country:N", 
        title="country",
        scale=alt.Scale(
            domain=natural_seqs_df["country"].unique().tolist(),
            range=tol_muted_adjusted[1:]
        ),
    ), 
    tooltip=[
        "strain",
        "country",
        "host",
        "date",
        "total_escape",
        "total_377H_escape", 
        "total_256A_escape", 
        "total_372D_escape", 
        "total_121F_escape", 
        "total_89F_escape", 
        "total_2510C_escape",
        "total_func_effect",
    ],
).transform_calculate(
    # Generate Gaussian jitter with a Box-Muller transform
    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
).properties(
    width=500,
    height=300,
)

combined_plot = alt.hconcat(
    antibody_escape,
    func_effects,
    spacing=5,
    title="Antibody and functional effects for different hosts",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
)

combined_plot
Out[13]:
In [14]:
antibody_list = [
    "total_377H_escape", 
    "total_256A_escape", 
    "total_372D_escape", 
    "total_121F_escape", 
    "total_89F_escape", 
    "total_2510C_escape",
]

subplots = []

for antibody in antibody_list:

    curr_title = f"predicted antibody {antibody.split('_')[1]} escape"
    curr_subplot = alt.Chart(
        natural_seqs_df,
        title=curr_title,
    ).mark_point(
        filled=True, 
        color="black", 
        size=75,
        opacity=0.5,
    ).encode(
        alt.Y(
            antibody,
            axis=alt.Axis(
                title="total antibody escape",
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
        ),
        alt.X(
            "country:N",
            axis=alt.Axis(
                title="Country",
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
        ),
        xOffset="jitter:Q",
        color=alt.Color(
            "country:N", 
            title="country",
            scale=alt.Scale(
                domain=natural_seqs_df["country"].unique().tolist(),
                range=tol_muted_adjusted[1:]
            ),
        ), 
        tooltip=[
            "strain",
            "country",
            "host",
            "date",
            "total_escape",
            "total_377H_escape", 
            "total_256A_escape", 
            "total_372D_escape", 
            "total_121F_escape", 
            "total_89F_escape", 
            "total_2510C_escape",
            "total_func_effect",
        ],
    ).transform_calculate(
        # Generate Gaussian jitter with a Box-Muller transform
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    ).properties(
        width=500,
        height=300,
    )

    subplots.append(curr_subplot)

combined_plot = alt.vconcat(
    subplots[0],
    subplots[1],
    subplots[2],
    subplots[3],
    subplots[4],
    subplots[5],
    spacing=5,
    title="Individual antibody escape by country",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
)

combined_plot
Out[14]: